In [1]:
#let us start by importing the relevant libraries

%matplotlib inline
import warnings
import seaborn as sns
warnings.filterwarnings('ignore')
#import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report,roc_auc_score
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
In [2]:
##Read the data as a dataframe
# NOTE(review): relative path — assumes vehicle.csv sits next to this notebook
df = pd.read_csv('vehicle.csv') 
In [3]:
#Perform basic EDA
In [4]:
df.head(20)
Out[4]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95 48.0 83.0 178.0 72.0 10 162.0 42.0 20.0 159 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197 van
1 91 41.0 84.0 141.0 57.0 9 149.0 45.0 19.0 143 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199 van
2 104 50.0 106.0 209.0 66.0 10 207.0 32.0 23.0 158 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196 car
3 93 41.0 82.0 159.0 63.0 9 144.0 46.0 19.0 143 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207 van
4 85 44.0 70.0 205.0 103.0 52 149.0 45.0 19.0 144 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183 bus
5 107 NaN 106.0 172.0 50.0 6 255.0 26.0 28.0 169 280.0 957.0 264.0 85.0 5.0 9.0 181.0 183 bus
6 97 43.0 73.0 173.0 65.0 6 153.0 42.0 19.0 143 176.0 361.0 172.0 66.0 13.0 1.0 200.0 204 bus
7 90 43.0 66.0 157.0 65.0 9 137.0 48.0 18.0 146 162.0 281.0 164.0 67.0 3.0 3.0 193.0 202 van
8 86 34.0 62.0 140.0 61.0 7 122.0 54.0 17.0 127 141.0 223.0 112.0 64.0 2.0 14.0 200.0 208 van
9 93 44.0 98.0 NaN 62.0 11 183.0 36.0 22.0 146 202.0 505.0 152.0 64.0 4.0 14.0 195.0 204 car
10 86 36.0 70.0 143.0 61.0 9 133.0 50.0 18.0 130 153.0 266.0 127.0 66.0 2.0 10.0 194.0 202 van
11 90 34.0 66.0 136.0 55.0 6 123.0 54.0 17.0 118 148.0 224.0 118.0 65.0 5.0 26.0 196.0 202 car
12 88 46.0 74.0 171.0 68.0 6 152.0 43.0 19.0 148 180.0 349.0 192.0 71.0 5.0 11.0 189.0 195 bus
13 89 42.0 85.0 144.0 58.0 10 152.0 44.0 19.0 144 173.0 345.0 161.0 72.0 8.0 13.0 187.0 197 van
14 94 49.0 79.0 203.0 71.0 5 174.0 37.0 21.0 154 196.0 465.0 206.0 71.0 6.0 2.0 197.0 199 bus
15 96 55.0 103.0 201.0 65.0 9 204.0 32.0 23.0 166 227.0 624.0 246.0 74.0 6.0 2.0 186.0 194 car
16 89 36.0 51.0 109.0 52.0 6 118.0 57.0 17.0 129 137.0 206.0 125.0 80.0 2.0 14.0 181.0 185 van
17 99 41.0 77.0 197.0 69.0 6 177.0 36.0 21.0 139 202.0 485.0 151.0 72.0 4.0 10.0 198.0 199 bus
18 104 54.0 100.0 186.0 61.0 10 216.0 31.0 24.0 173 225.0 686.0 220.0 74.0 5.0 11.0 185.0 195 car
19 101 56.0 100.0 215.0 NaN 10 208.0 32.0 24.0 169 227.0 651.0 223.0 74.0 6.0 5.0 186.0 193 car
In [5]:
##Data type of each attribute
# (all predictors are numeric; only 'class' is an object/string column)
df.dtypes
Out[5]:
compactness                      int64
circularity                    float64
distance_circularity           float64
radius_ratio                   float64
pr.axis_aspect_ratio           float64
max.length_aspect_ratio          int64
scatter_ratio                  float64
elongatedness                  float64
pr.axis_rectangularity         float64
max.length_rectangularity        int64
scaled_variance                float64
scaled_variance.1              float64
scaled_radius_of_gyration      float64
scaled_radius_of_gyration.1    float64
skewness_about                 float64
skewness_about.1               float64
skewness_about.2               float64
hollows_ratio                    int64
class                           object
dtype: object
In [6]:
## Shape of the data
# 846 rows x 19 columns (18 predictors + the 'class' target)
df.shape
Out[6]:
(846, 19)
In [7]:
##Checking the presence of missing values
In [8]:
def missing_check(df):
    """Summarize missing values per column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        One row per column with 'Total' (NaN count) and 'Percent'
        (NaN fraction of rows), sorted most-missing first.
    """
    null_mask = df.isnull()
    counts = null_mask.sum().sort_values(ascending=False)
    # mean of a boolean mask == (# True) / (# rows), i.e. the NaN fraction
    fractions = null_mask.mean().sort_values(ascending=False)
    return pd.concat([counts, fractions], axis=1, keys=['Total', 'Percent'])
missing_check(df)
Out[8]:
Total Percent
radius_ratio 6 0.007092
skewness_about 6 0.007092
circularity 5 0.005910
scaled_radius_of_gyration.1 4 0.004728
distance_circularity 4 0.004728
scaled_variance 3 0.003546
pr.axis_rectangularity 3 0.003546
scaled_radius_of_gyration 2 0.002364
scaled_variance.1 2 0.002364
pr.axis_aspect_ratio 2 0.002364
skewness_about.2 1 0.001182
skewness_about.1 1 0.001182
elongatedness 1 0.001182
scatter_ratio 1 0.001182
class 0 0.000000
max.length_rectangularity 0 0.000000
max.length_aspect_ratio 0 0.000000
hollows_ratio 0 0.000000
compactness 0 0.000000
In [9]:
##5 Point summary of numerical attributes
# (the 'count' row being < 846 for some columns confirms the missing values)
df.describe()
Out[9]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
count 846.000000 841.000000 842.000000 840.000000 844.000000 846.000000 845.000000 845.000000 843.000000 846.000000 843.000000 844.000000 844.000000 842.000000 840.000000 845.000000 845.000000 846.000000
mean 93.678487 44.828775 82.110451 168.888095 61.678910 8.567376 168.901775 40.933728 20.582444 147.998818 188.631079 439.494076 174.709716 72.447743 6.364286 12.602367 188.919527 195.632388
std 8.234474 6.152172 15.778292 33.520198 7.891463 4.601217 33.214848 7.816186 2.592933 14.515652 31.411004 176.666903 32.584808 7.486190 4.920649 8.936081 6.155809 7.438797
min 73.000000 33.000000 40.000000 104.000000 47.000000 2.000000 112.000000 26.000000 17.000000 118.000000 130.000000 184.000000 109.000000 59.000000 0.000000 0.000000 176.000000 181.000000
25% 87.000000 40.000000 70.000000 141.000000 57.000000 7.000000 147.000000 33.000000 19.000000 137.000000 167.000000 318.000000 149.000000 67.000000 2.000000 5.000000 184.000000 190.250000
50% 93.000000 44.000000 80.000000 167.000000 61.000000 8.000000 157.000000 43.000000 20.000000 146.000000 179.000000 363.500000 173.500000 71.500000 6.000000 11.000000 188.000000 197.000000
75% 100.000000 49.000000 98.000000 195.000000 65.000000 10.000000 198.000000 46.000000 23.000000 159.000000 217.000000 587.000000 198.000000 75.000000 9.000000 19.000000 193.000000 201.000000
max 119.000000 59.000000 112.000000 333.000000 138.000000 55.000000 265.000000 61.000000 29.000000 188.000000 320.000000 1018.000000 268.000000 135.000000 22.000000 41.000000 206.000000 211.000000
In [10]:
df.skew()
Out[10]:
compactness                    0.381271
circularity                    0.261809
distance_circularity           0.106585
radius_ratio                   0.394978
pr.axis_aspect_ratio           3.830362
max.length_aspect_ratio        6.778394
scatter_ratio                  0.607271
elongatedness                  0.047847
pr.axis_rectangularity         0.770889
max.length_rectangularity      0.256359
scaled_variance                0.651598
scaled_variance.1              0.842034
scaled_radius_of_gyration      0.279317
scaled_radius_of_gyration.1    2.083496
skewness_about                 0.776519
skewness_about.1               0.688017
skewness_about.2               0.249321
hollows_ratio                 -0.226341
dtype: float64
In [11]:
##Let's treat data for missing values first and then we can see the outliers
In [12]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 
le = LabelEncoder() 
columns = df.columns
#Let's Label Encode our class variable: 
print(columns)
# NOTE(review): mutates df in place — string labels are replaced by integer
# codes (presumably alphabetical: bus=0, car=1, van=2 — TODO confirm); the
# original labels are recoverable only via le.inverse_transform.
df['class'] = le.fit_transform(df['class'])
df.shape
Index(['compactness', 'circularity', 'distance_circularity', 'radius_ratio',
       'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
       'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
       'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
       'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
       'skewness_about.2', 'hollows_ratio', 'class'],
      dtype='object')
Out[12]:
(846, 19)
In [13]:
from sklearn.impute import SimpleImputer
# Work on a copy so the label-encoded source frame stays untouched.
newdf = df.copy()
X = newdf.iloc[:,0:19]  # all 19 columns, including the encoded 'class'
# Fill missing values with the column MEDIAN (robust to the outliers seen above).
# The deprecated `verbose` argument was dropped (removed in scikit-learn >= 1.3).
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
transformed_values = imputer.fit_transform(X)
column = X.columns
print(column)
# fit_transform returns a bare ndarray; rebuild a frame with the original names
newdf = pd.DataFrame(transformed_values, columns = column)
newdf.describe()
Index(['compactness', 'circularity', 'distance_circularity', 'radius_ratio',
       'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
       'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
       'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
       'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
       'skewness_about.2', 'hollows_ratio', 'class'],
      dtype='object')
Out[13]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
count 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000 846.000000
mean 93.678487 44.823877 82.100473 168.874704 61.677305 8.567376 168.887707 40.936170 20.580378 147.998818 188.596927 439.314421 174.706856 72.443262 6.361702 12.600473 188.918440 195.632388 0.977541
std 8.234474 6.134272 15.741569 33.401356 7.882188 4.601217 33.197710 7.811882 2.588558 14.515652 31.360427 176.496341 32.546277 7.468734 4.903244 8.930962 6.152247 7.438797 0.702130
min 73.000000 33.000000 40.000000 104.000000 47.000000 2.000000 112.000000 26.000000 17.000000 118.000000 130.000000 184.000000 109.000000 59.000000 0.000000 0.000000 176.000000 181.000000 0.000000
25% 87.000000 40.000000 70.000000 141.000000 57.000000 7.000000 147.000000 33.000000 19.000000 137.000000 167.000000 318.250000 149.000000 67.000000 2.000000 5.000000 184.000000 190.250000 0.000000
50% 93.000000 44.000000 80.000000 167.000000 61.000000 8.000000 157.000000 43.000000 20.000000 146.000000 179.000000 363.500000 173.500000 71.500000 6.000000 11.000000 188.000000 197.000000 1.000000
75% 100.000000 49.000000 98.000000 195.000000 65.000000 10.000000 198.000000 46.000000 23.000000 159.000000 217.000000 586.750000 198.000000 75.000000 9.000000 19.000000 193.000000 201.000000 1.000000
max 119.000000 59.000000 112.000000 333.000000 138.000000 55.000000 265.000000 61.000000 29.000000 188.000000 320.000000 1018.000000 268.000000 135.000000 22.000000 41.000000 206.000000 211.000000 2.000000
In [14]:
missing_check(newdf)
Out[14]:
Total Percent
class 0 0.0
pr.axis_rectangularity 0 0.0
circularity 0 0.0
distance_circularity 0 0.0
radius_ratio 0 0.0
pr.axis_aspect_ratio 0 0.0
max.length_aspect_ratio 0 0.0
scatter_ratio 0 0.0
elongatedness 0 0.0
max.length_rectangularity 0 0.0
hollows_ratio 0 0.0
scaled_variance 0 0.0
scaled_variance.1 0 0.0
scaled_radius_of_gyration 0 0.0
scaled_radius_of_gyration.1 0 0.0
skewness_about 0 0.0
skewness_about.1 0 0.0
skewness_about.2 0 0.0
compactness 0 0.0
In [15]:
newdf.describe().T
Out[15]:
count mean std min 25% 50% 75% max
compactness 846.0 93.678487 8.234474 73.0 87.00 93.0 100.00 119.0
circularity 846.0 44.823877 6.134272 33.0 40.00 44.0 49.00 59.0
distance_circularity 846.0 82.100473 15.741569 40.0 70.00 80.0 98.00 112.0
radius_ratio 846.0 168.874704 33.401356 104.0 141.00 167.0 195.00 333.0
pr.axis_aspect_ratio 846.0 61.677305 7.882188 47.0 57.00 61.0 65.00 138.0
max.length_aspect_ratio 846.0 8.567376 4.601217 2.0 7.00 8.0 10.00 55.0
scatter_ratio 846.0 168.887707 33.197710 112.0 147.00 157.0 198.00 265.0
elongatedness 846.0 40.936170 7.811882 26.0 33.00 43.0 46.00 61.0
pr.axis_rectangularity 846.0 20.580378 2.588558 17.0 19.00 20.0 23.00 29.0
max.length_rectangularity 846.0 147.998818 14.515652 118.0 137.00 146.0 159.00 188.0
scaled_variance 846.0 188.596927 31.360427 130.0 167.00 179.0 217.00 320.0
scaled_variance.1 846.0 439.314421 176.496341 184.0 318.25 363.5 586.75 1018.0
scaled_radius_of_gyration 846.0 174.706856 32.546277 109.0 149.00 173.5 198.00 268.0
scaled_radius_of_gyration.1 846.0 72.443262 7.468734 59.0 67.00 71.5 75.00 135.0
skewness_about 846.0 6.361702 4.903244 0.0 2.00 6.0 9.00 22.0
skewness_about.1 846.0 12.600473 8.930962 0.0 5.00 11.0 19.00 41.0
skewness_about.2 846.0 188.918440 6.152247 176.0 184.00 188.0 193.00 206.0
hollows_ratio 846.0 195.632388 7.438797 181.0 190.25 197.0 201.00 211.0
class 846.0 0.977541 0.702130 0.0 0.00 1.0 1.00 2.0
In [16]:
##we got rid of all missing values let's see outliers 
##Checking the presence of outliers and distribution
In [17]:
# Use the seaborn-like whitegrid style. The bare 'seaborn-whitegrid' name was
# deprecated in matplotlib 3.6 and removed in 3.8; fall back to the renamed
# 'seaborn-v0_8-whitegrid' so the cell works on both old and new matplotlib.
try:
    plt.style.use('seaborn-whitegrid')
except OSError:
    plt.style.use('seaborn-v0_8-whitegrid')

# One histogram per column: check distribution shape and gaps/outliers
newdf.hist(bins=20, figsize=(60,40), color='green', edgecolor = 'red')
plt.show()
In [18]:
## Observation
# Most of the data attributes seems to be normally distributed
# scaled variance 1 and skewness about 1 and 2, scatter_ratio, seem to be right skewed.
# pr.axis_rectangularity seems to be having outliers, as there are some gaps found in the bar plot.
In [19]:
# Recompute skewness after imputation (values shift slightly vs. the raw frame)
skewValue = newdf.skew()
print("skewValue of dataframe attributes: ", skewValue)
skewValue of dataframe attributes:  compactness                    0.381271
circularity                    0.264928
distance_circularity           0.108718
radius_ratio                   0.397572
pr.axis_aspect_ratio           3.835392
max.length_aspect_ratio        6.778394
scatter_ratio                  0.608710
elongatedness                  0.046951
pr.axis_rectangularity         0.774406
max.length_rectangularity      0.256359
scaled_variance                0.655598
scaled_variance.1              0.845345
scaled_radius_of_gyration      0.279910
scaled_radius_of_gyration.1    2.089979
skewness_about                 0.780813
skewness_about.1               0.689014
skewness_about.2               0.249985
hollows_ratio                 -0.226341
class                          0.031106
dtype: float64
In [20]:
#univariant analysis using boxplot 
# All columns on one shared horizontal axis (note: the columns' scales differ widely)
sns.boxplot(data=newdf, orient="h")
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1dd8d2d0>
In [21]:
newdf.boxplot(column=['pr.axis_aspect_ratio','skewness_about','scaled_variance'], figsize=(30,10))
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a19960d90>
In [22]:
newdf.boxplot(column=['radius_ratio','scaled_radius_of_gyration.1','scaled_variance.1'], figsize=(30,10))
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1dd852d0>
In [23]:
newdf.boxplot(column=['max.length_aspect_ratio','skewness_about.1'], figsize=(30,10))
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1ddc3dd0>
In [24]:
## Observation
# pr.axis_aspect_ratio, skewness_about, max_length_aspect_ratio, skewness_about_1,
# scaled_radius_of_gyration.1, scaled_variance.1, radius_ratio, skewness_about, scaled_variance.1 
# are some of the attributes with outliers.
In [25]:
# Let's start Treating Outliers Using IQR: Upper whisker
In [26]:
newdf.shape
Out[26]:
(846, 19)
In [27]:
# Per-column interquartile range (Q3 - Q1); used below to flag values outside
# the [Q1 - 1.5*IQR, Q3 + 1.5*IQR] whiskers.
# (the previously imported scipy.stats.iqr was never used anywhere, so it is dropped)
Q1 = newdf.quantile(0.25)
Q3 = newdf.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
compactness                     13.00
circularity                      9.00
distance_circularity            28.00
radius_ratio                    54.00
pr.axis_aspect_ratio             8.00
max.length_aspect_ratio          3.00
scatter_ratio                   51.00
elongatedness                   13.00
pr.axis_rectangularity           4.00
max.length_rectangularity       22.00
scaled_variance                 50.00
scaled_variance.1              268.50
scaled_radius_of_gyration       49.00
scaled_radius_of_gyration.1      8.00
skewness_about                   7.00
skewness_about.1                14.00
skewness_about.2                 9.00
hollows_ratio                   10.75
class                            1.00
dtype: float64
In [28]:
np.where((newdf < (Q1 - 1.5 * IQR)) | (newdf > (Q3 + 1.5 * IQR)))
Out[28]:
(array([  4,   4,   4,  37,  37,  37,  37,  44,  47,  79,  85, 100, 100,
        100, 113, 123, 127, 132, 135, 135, 135, 135, 190, 230, 291, 291,
        291, 346, 381, 388, 388, 388, 388, 388, 391, 400, 498, 505, 516,
        523, 523, 523, 544, 623, 655, 655, 706, 706, 706, 761, 796, 797,
        815, 815, 835]),
 array([ 4,  5, 13,  3,  4,  5, 13, 14, 13, 13, 11,  4,  5, 13, 14, 14,  5,
        15,  3,  4,  5, 13, 14, 13,  4,  5, 13, 14, 13,  3,  4,  5, 10, 13,
         5, 14, 13, 14, 14,  4,  5, 13,  5, 14,  5, 13,  4,  5, 13, 14, 14,
        14,  5, 13, 11]))
In [29]:
# we can use previously calculated IQR score to filter out the outliers by keeping only valid values.
In [30]:
# Keep only rows whose every attribute lies inside the 1.5*IQR whiskers
# (drops 33 of 846 rows, see the shape below)
newdf2 = newdf[~((newdf < (Q1 - 1.5 * IQR)) |(newdf > (Q3 + 1.5 * IQR))).any(axis=1)] # rows without outliers
newdf2.shape
Out[30]:
(813, 19)
In [31]:
# let's check outliers are removed by boxplot
In [32]:
# Re-draw boxplots of the previously outlier-heavy columns to confirm removal
newdf2.boxplot(column=['pr.axis_aspect_ratio','skewness_about', 'scaled_variance','radius_ratio', 'scaled_radius_of_gyration.1',
                     'scaled_variance.1', 'max.length_aspect_ratio', 'skewness_about.1'], figsize=(30,10))
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1dcdd950>
In [33]:
#from above I can see only one outlier in scaled_variance.1,
#which can be tolerated because it sits almost on the whisker of the boxplot
In [34]:
## Let's understand the relationship between independent variables
In [35]:
def correlation_heatmap(dataframe, l, w):
    """Plot an annotated pairwise-correlation heatmap of `dataframe`.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Numeric frame whose column-wise correlations are shown.
    l, w : int
        Figure length and width in inches.
    """
    correlation = dataframe.corr()
    plt.figure(figsize=(l,w))
    sns.heatmap(correlation, vmax=1, square=True,annot=True,cmap='viridis')
    plt.title('Correlation between different features')
    plt.show();

# Drop the class column, since PCA (and this correlation study) should be
# performed on the independent attributes only.
cleandf= newdf2.drop('class', axis=1)
# Bug fix: the heatmap was previously drawn from newdf2 (class still included);
# pass the cleaned frame so the plot matches the stated intent.
correlation_heatmap(cleandf, 30,15)
In [36]:
# Strong relation
#       - Scaled Variance & Scaled Variance.1 seems to be strongly correlated with value of 0.98
#       - skewness_about_2 and hollow_ratio seems to be strongly correlated, coeff: 0.89
#       - ditance_circularity and radius_ratio seems to have high positive correlation with corr coeff: 0.81
#       - compactness & circularity , radius_ratio & pr.axis_aspect_ratio also seems ver averagely correlated with coeff: 0.67.
#       - scaled _variance and scaled_radius_of_gyration, circularity & distance_circularity also seems to be highly correlated with corr coeff: 0.79
#       - pr.axis_recatngularity and max.length_recatngularity also seems to be strongly correlated with coeff: 0.81 
#       - scatter_ratio and elongatedness seems to be have strong negative correlation val : 0.97
#       - elongatedness and pr.axis_rectangularity seems to have strong negative correlation, val:  0.95
In [37]:
#No/little relation
#       -max_length_aspect_ratio & radius_ratio have average correlation with coeff: 0.46
#       - pr.axis_aspect_ratio & max_length_aspect_ratio seems to have very little correlation
#       - scaled_radius_gyration & scaled_radisu_gyration.1 seems to be very little correlated
#       - scaled_radius_gyration.1 & skewness_about seems to be very little correlated
#       - skewness_about & skewness_about.1 not be correlated
#       - skewness_about.1 and skewness_about.2 are not correlated
In [38]:
sns.pairplot(newdf2, diag_kind="kde")
Out[38]:
<seaborn.axisgrid.PairGrid at 0x1a1ab00750>
In [39]:
# From above correlation matrix we can see that there are many features which are highly correlated. 
#we will find that many features are there which having more than 0.9 correlation. 
#so we can decide to get rid of those columns whose correlation is +-0.9 or above.There are 8 such columns:

# max.length_rectangularity
# scaled_radius_of_gyration
# skewness_about.2
# scatter_ratio
# elongatedness
# pr.axis_rectangularity
# scaled_variance
# scaled_variance.1
In [40]:
## use a dimension reduction algorithm such as Principle Component Analysis (PCA). 
# We will go for PCA and analyse the same going forward
In [41]:
#Let's choose the right variable
In [42]:
#display how many are car,bus,van. 
# Bug fix: value_counts() was computed but its result was discarded (it was not
# the cell's last expression); print it so the counts actually display.
print(newdf2['class'].value_counts())
sns.countplot(newdf2['class'])
plt.show()
In [43]:
#. Split the data into train and test 
In [44]:
# Features = columns 0..17 (18 predictors); target = column 18, the encoded 'class'
X = newdf2.iloc[:,0:18].values
y = newdf2.iloc[:,18].values
X
Out[44]:
array([[ 95.,  48.,  83., ...,  16., 187., 197.],
       [ 91.,  41.,  84., ...,  14., 189., 199.],
       [104.,  50., 106., ...,   9., 188., 196.],
       ...,
       [106.,  54., 101., ...,   4., 187., 201.],
       [ 86.,  36.,  78., ...,  25., 190., 195.],
       [ 85.,  36.,  66., ...,  18., 186., 190.]])
In [45]:
# We transform (centralize) the entire X (independent variable data) to zscores through transformation. We will create the PCA dimensions
# on this distribution. 
In [46]:
from sklearn.preprocessing import StandardScaler
# Standardize X to z-scores so every attribute contributes equally to PCA.
sc = StandardScaler()
X_std =  sc.fit_transform(X)          
# Covariance matrix of the standardized features (transpose: rows = variables).
cov_matrix = np.cov(X_std.T)
# Bug fix: print('...%s', cov_matrix) passed TWO arguments, printing a literal
# "%s" (visible in the old output); interpolate with the % operator instead.
print('Covariance Matrix \n%s' % cov_matrix)
Covariance Matrix 
%s [[ 1.00123153e+00  6.80164027e-01  7.87792814e-01  7.46906930e-01
   2.00881439e-01  4.98273207e-01  8.11840645e-01 -7.89531434e-01
   8.12866245e-01  6.74996601e-01  7.92438680e-01  8.13494150e-01
   5.78399755e-01 -2.53990635e-01  2.00887113e-01  1.61304844e-01
   2.95777412e-01  3.64608943e-01]
 [ 6.80164027e-01  1.00123153e+00  7.87747162e-01  6.41725205e-01
   2.06409699e-01  5.64854067e-01  8.44804611e-01 -8.16768295e-01
   8.41196310e-01  9.62404205e-01  8.03750964e-01  8.33508154e-01
   9.26281607e-01  6.67790806e-02  1.40563881e-01 -1.43598307e-02
  -1.16976151e-01  3.92302597e-02]
 [ 7.87792814e-01  7.87747162e-01  1.00123153e+00  8.09326627e-01
   2.45756551e-01  6.69657073e-01  9.06692225e-01 -9.09806087e-01
   8.95884623e-01  7.69635504e-01  8.85221631e-01  8.89286924e-01
   7.03348558e-01 -2.38231284e-01  9.89345733e-02  2.63832735e-01
   1.29070982e-01  3.22051625e-01]
 [ 7.46906930e-01  6.41725205e-01  8.09326627e-01  1.00123153e+00
   6.67029240e-01  4.61258592e-01  7.90495472e-01 -8.45064567e-01
   7.64769672e-01  5.77501217e-01  7.93778346e-01  7.77097647e-01
   5.51222677e-01 -4.03672885e-01  4.03555670e-02  1.87420711e-01
   4.18869167e-01  5.05314324e-01]
 [ 2.00881439e-01  2.06409699e-01  2.45756551e-01  6.67029240e-01
   1.00123153e+00  1.38431761e-01  2.00217560e-01 -3.02289321e-01
   1.69961019e-01  1.46036511e-01  2.15074904e-01  1.86526180e-01
   1.53697623e-01 -3.25502385e-01 -5.16026240e-02 -2.86185855e-02
   4.06792617e-01  4.20318003e-01]
 [ 4.98273207e-01  5.64854067e-01  6.69657073e-01  4.61258592e-01
   1.38431761e-01  1.00123153e+00  4.98078976e-01 -5.02996017e-01
   4.97845069e-01  6.48642021e-01  4.12068816e-01  4.58456162e-01
   4.04786322e-01 -3.33161873e-01  8.41082601e-02  1.41145578e-01
   5.64852182e-02  3.94934461e-01]
 [ 8.11840645e-01  8.44804611e-01  9.06692225e-01  7.90495472e-01
   2.00217560e-01  4.98078976e-01  1.00123153e+00 -9.73537513e-01
   9.90659730e-01  8.08063766e-01  9.78751548e-01  9.94204811e-01
   7.95893849e-01  2.44702588e-03  6.35490363e-02  2.14445853e-01
  -3.10409338e-03  1.16323654e-01]
 [-7.89531434e-01 -8.16768295e-01 -9.09806087e-01 -8.45064567e-01
  -3.02289321e-01 -5.02996017e-01 -9.73537513e-01  1.00123153e+00
  -9.51112661e-01 -7.70982661e-01 -9.66090990e-01 -9.56973892e-01
  -7.63345981e-01  8.70842667e-02 -4.55135596e-02 -1.84181395e-01
  -1.05393355e-01 -2.11345600e-01]
 [ 8.12866245e-01  8.41196310e-01  8.95884623e-01  7.64769672e-01
   1.69961019e-01  4.97845069e-01  9.90659730e-01 -9.51112661e-01
   1.00123153e+00  8.11346565e-01  9.64981168e-01  9.88989478e-01
   7.93172901e-01  1.77904437e-02  7.28156271e-02  2.16892797e-01
  -2.65026808e-02  9.80719286e-02]
 [ 6.74996601e-01  9.62404205e-01  7.69635504e-01  5.77501217e-01
   1.46036511e-01  6.48642021e-01  8.08063766e-01 -7.70982661e-01
   8.11346565e-01  1.00123153e+00  7.50600479e-01  7.95049173e-01
   8.68007898e-01  5.26495142e-02  1.34795631e-01 -2.44448372e-03
  -1.17812145e-01  6.72596198e-02]
 [ 7.92438680e-01  8.03750964e-01  8.85221631e-01  7.93778346e-01
   2.15074904e-01  4.12068816e-01  9.78751548e-01 -9.66090990e-01
   9.64981168e-01  7.50600479e-01  1.00123153e+00  9.76750881e-01
   7.81984129e-01  1.68621531e-02  3.39888849e-02  2.05971428e-01
   2.28035846e-02  9.60435931e-02]
 [ 8.13494150e-01  8.33508154e-01  8.89286924e-01  7.77097647e-01
   1.86526180e-01  4.58456162e-01  9.94204811e-01 -9.56973892e-01
   9.88989478e-01  7.95049173e-01  9.76750881e-01  1.00123153e+00
   7.90805725e-01  1.62348310e-02  6.49567636e-02  2.03838067e-01
   7.85566308e-05  1.03330899e-01]
 [ 5.78399755e-01  9.26281607e-01  7.03348558e-01  5.51222677e-01
   1.53697623e-01  4.04786322e-01  7.95893849e-01 -7.63345981e-01
   7.93172901e-01  8.68007898e-01  7.81984129e-01  7.90805725e-01
   1.00123153e+00  2.16651698e-01  1.68973862e-01 -5.83635746e-02
  -2.32617810e-01 -1.20727281e-01]
 [-2.53990635e-01  6.67790806e-02 -2.38231284e-01 -4.03672885e-01
  -3.25502385e-01 -3.33161873e-01  2.44702588e-03  8.70842667e-02
   1.77904437e-02  5.26495142e-02  1.68621531e-02  1.62348310e-02
   2.16651698e-01  1.00123153e+00 -5.93373719e-02 -1.31142620e-01
  -8.43627948e-01 -9.18420730e-01]
 [ 2.00887113e-01  1.40563881e-01  9.89345733e-02  4.03555670e-02
  -5.16026240e-02  8.41082601e-02  6.35490363e-02 -4.55135596e-02
   7.28156271e-02  1.34795631e-01  3.39888849e-02  6.49567636e-02
   1.68973862e-01 -5.93373719e-02  1.00123153e+00 -4.53538836e-02
   8.48972195e-02  6.12111362e-02]
 [ 1.61304844e-01 -1.43598307e-02  2.63832735e-01  1.87420711e-01
  -2.86185855e-02  1.41145578e-01  2.14445853e-01 -1.84181395e-01
   2.16892797e-01 -2.44448372e-03  2.05971428e-01  2.03838067e-01
  -5.83635746e-02 -1.31142620e-01 -4.53538836e-02  1.00123153e+00
   7.28908031e-02  2.00156475e-01]
 [ 2.95777412e-01 -1.16976151e-01  1.29070982e-01  4.18869167e-01
   4.06792617e-01  5.64852182e-02 -3.10409338e-03 -1.05393355e-01
  -2.65026808e-02 -1.17812145e-01  2.28035846e-02  7.85566308e-05
  -2.32617810e-01 -8.43627948e-01  8.48972195e-02  7.28908031e-02
   1.00123153e+00  8.91041674e-01]
 [ 3.64608943e-01  3.92302597e-02  3.22051625e-01  5.05314324e-01
   4.20318003e-01  3.94934461e-01  1.16323654e-01 -2.11345600e-01
   9.80719286e-02  6.72596198e-02  9.60435931e-02  1.03330899e-01
  -1.20727281e-01 -9.18420730e-01  6.12111362e-02  2.00156475e-01
   8.91041674e-01  1.00123153e+00]]
In [47]:
# Eigen-decomposition of the covariance matrix: eigenvectors are the principal
# directions, eigenvalues the variance explained along each of them.
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
# Bug fix: interpolate with the % operator instead of passing a second print()
# argument (the old output showed a literal "%s").
print('Eigen Vectors \n%s' % eigenvectors)
print('\n Eigen Values \n%s' % eigenvalues)
Eigen Vectors 
%s [[-2.72251046e-01 -8.97284818e-02  2.26045073e-02  1.30419032e-01
  -1.52324139e-01  2.58374578e-01 -1.88794221e-01 -7.71578238e-01
  -3.61784776e-01 -1.25233628e-01  2.92009470e-02  7.62442008e-04
  -1.06680587e-02  1.05983722e-02 -1.01407495e-01 -1.46326861e-01
  -3.81638532e-03  3.32992130e-03]
 [-2.85370045e-01  1.33173937e-01  2.10809943e-01 -2.06785531e-02
   1.39022591e-01 -6.88979940e-02  3.90871235e-01 -6.60528436e-02
  -4.62957583e-02  2.40262612e-01  7.29503235e-02  1.93799916e-01
  -7.74670931e-03 -8.71766559e-02 -3.11337823e-01  1.96463651e-01
  -2.96230720e-01  5.83996136e-01]
 [-3.01486231e-01 -4.40259591e-02 -7.08780817e-02  1.07425217e-01
   8.07335409e-02 -2.04800896e-02 -1.76384547e-01  2.98693883e-01
  -2.64499195e-01 -9.42971834e-02  7.78755026e-01 -2.32649049e-01
   1.11905744e-02  2.28724292e-02  5.89166755e-02  5.33931974e-02
   9.72735293e-02  8.64160083e-02]
 [-2.72594510e-01 -2.04232234e-01 -4.02139629e-02 -2.52957341e-01
  -1.19012554e-01 -1.39449676e-01 -1.56474448e-01  5.20410402e-02
  -1.70430331e-01  8.97062530e-02 -1.31647081e-01  2.75143903e-01
  -3.74689248e-02  2.90668794e-02 -2.04574984e-01  6.58916577e-01
   2.74900989e-01 -2.71300494e-01]
 [-9.85797647e-02 -2.59136858e-01  1.14805227e-01 -6.05228001e-01
  -8.32128223e-02 -5.87145492e-01 -1.02492950e-01 -1.61872497e-01
   1.17212341e-02  2.87528583e-02  4.97534613e-02 -1.45558629e-01
   2.09842091e-02 -9.40948646e-03  1.50893891e-01 -2.89610835e-01
  -1.19100067e-01  9.64017331e-02]
 [-1.94755787e-01 -9.45756320e-02  1.39313484e-01  3.22531411e-01
   6.21376071e-01 -2.65624695e-01 -3.98851794e-01 -5.85800952e-02
   1.73213170e-01 -2.49937617e-01 -1.98444456e-01  1.72600201e-01
  -1.06888298e-02  1.20980507e-02  1.76055013e-01  6.68511988e-02
  -2.92959443e-02  1.10841470e-01]
 [-3.10518442e-01  7.23350799e-02 -1.12924698e-01 -1.00540370e-02
  -8.12405608e-02  8.93335163e-02 -9.14237336e-02  8.45300921e-02
   1.37499298e-01  1.11244025e-01 -1.61642905e-01 -8.22439493e-02
   8.37148260e-01  2.72442207e-01 -1.51805844e-02 -7.66778803e-02
   5.60355480e-02  8.33248999e-02]
 [ 3.08438338e-01 -1.16876769e-02  9.00330455e-02  7.99117560e-02
   7.47379231e-02 -7.25853857e-02  1.04875746e-01 -2.16815347e-01
  -2.59988735e-01  1.24837047e-01 -4.29365477e-03 -3.50089602e-01
   2.42295907e-01  2.61394487e-03  4.61164909e-01  5.23226723e-01
  -2.65096114e-01 -1.36447171e-02]
 [-3.07548493e-01  8.40915278e-02 -1.11063547e-01  1.60464922e-02
  -7.75020996e-02  9.60554272e-02 -9.06723384e-02  3.37069994e-02
   1.03269951e-01  2.11468012e-01 -2.40841717e-01 -3.42527317e-01
  -9.86931593e-02 -6.84892390e-01  2.18872117e-01  2.39504315e-02
   2.70709305e-01  1.72817545e-01]
 [-2.76301073e-01  1.25836631e-01  2.19877688e-01  6.66507863e-02
   2.46140560e-01 -6.35014904e-02  3.49667685e-01 -2.26684736e-01
   2.44776407e-01  3.87473859e-01  2.24580349e-01  3.05154380e-02
  -1.40549391e-02  4.47385929e-02  1.53765067e-01 -1.04419937e-01
   1.53673085e-01 -5.43122947e-01]
 [-3.02748114e-01  7.01998575e-02 -1.44818765e-01 -6.98045095e-02
  -1.49584067e-01  1.34458896e-01 -7.54753072e-02  1.45772665e-01
   5.85239946e-02 -1.47036092e-01  2.06902072e-02  2.33368955e-01
   1.43866319e-02 -2.54510995e-01  1.79499013e-01  1.16604375e-02
  -7.26163025e-01 -3.24937516e-01]
 [-3.07040626e-01  7.79336637e-02 -1.15323952e-01 -1.73631584e-02
  -1.15117310e-01  1.26968672e-01 -6.99641470e-02  5.32611781e-02
   1.28904560e-01  1.60305310e-01 -1.96322990e-01 -2.75169550e-01
  -4.75672122e-01  6.13103868e-01  2.20362642e-01  7.99305617e-02
  -1.22815848e-01  1.42051799e-01]
 [-2.61520489e-01  2.09927277e-01  2.13627435e-01 -7.22457181e-02
   7.54871674e-03 -7.33961842e-02  4.55851958e-01  1.58194670e-01
  -3.37170589e-01 -5.87690102e-01 -2.58436921e-01 -1.07063554e-01
   8.61256926e-03  4.41891377e-02  1.43753708e-01 -5.21969873e-02
   1.69567965e-01 -8.32177228e-02]
 [ 4.36323635e-02  5.03914450e-01 -6.73920886e-02 -1.35860558e-01
  -1.40527774e-01 -1.31928871e-01 -7.90311042e-02 -3.00374428e-01
   5.01365221e-01 -3.87030017e-01  2.27875444e-01 -1.38958435e-01
   7.55464886e-03 -1.59765660e-02 -1.34656976e-01  3.04769192e-01
   5.39469506e-02  3.01217731e-02]
 [-3.67057041e-02 -1.45682524e-02  5.21623444e-01  4.90121679e-01
  -5.89800103e-01 -3.12415086e-01 -1.30187397e-01  1.14687509e-01
   7.50393829e-02  5.41502565e-02 -1.39861362e-02  5.61401152e-03
  -2.19811008e-03 -5.03222786e-03 -1.37166771e-02 -4.76724453e-03
  -3.27151282e-02 -2.14301813e-02]
 [-5.88504115e-02 -9.33980545e-02 -6.87170643e-01  3.80232477e-01
  -1.27793729e-01 -4.82506903e-01  3.10629290e-01 -1.18168951e-01
  -3.07213623e-02 -1.36044539e-02 -1.77010708e-02  8.59021362e-02
  -1.39575997e-02  1.10992435e-02  2.72433694e-02 -2.97178011e-02
   1.82173722e-02  1.83842486e-02]
 [-3.48373860e-02 -5.01664210e-01  6.22069465e-02 -3.55391597e-02
  -1.81582693e-01  2.75222340e-01  2.59557864e-01 -7.27008273e-02
   3.62122453e-01 -2.20343289e-01  1.73696003e-01  2.79657886e-01
   3.82401827e-02  7.76499049e-03  4.14581122e-01  1.14797284e-01
   1.66961820e-01  2.41026732e-01]
 [-8.28136172e-02 -5.06546563e-01  4.08035393e-02  1.03008417e-01
   1.11256244e-01  6.05771535e-02  1.76348774e-01  1.81034286e-02
   2.40710780e-01 -1.71416688e-01 -7.22825606e-02 -5.36171185e-01
   3.98716359e-03 -4.78049584e-02 -4.65683959e-01  8.53480643e-02
  -1.96223612e-01 -1.78387852e-01]]

 Eigen Values 
%s [9.79297570e+00 3.37710644e+00 1.20873054e+00 1.13659560e+00
 8.96286859e-01 6.58293128e-01 3.23056525e-01 2.26906613e-01
 1.12741686e-01 7.62069059e-02 6.18393099e-02 4.42420969e-02
 3.12610726e-03 1.01216098e-02 2.99919142e-02 2.67735138e-02
 1.77191935e-02 1.94537446e-02]
In [48]:
# Step 3 (continued): Sort eigenvalues in descending order

# Pair each eigenvalue with its corresponding eigenvector (a column of `eigenvectors`)
eig_pairs = [(eigenvalues[index], eigenvectors[:, index]) for index in range(len(eigenvalues))]

# Sort the pairs from highest to lowest eigenvalue. Sort on the eigenvalue
# only: plain `eig_pairs.sort()` would compare the numpy eigenvectors whenever
# two eigenvalues tie, which raises "truth value of an array is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
print(eig_pairs)

# Extract the descending ordered eigenvalues and eigenvectors
eigvalues_sorted = [pair[0] for pair in eig_pairs]
eigvectors_sorted = [pair[1] for pair in eig_pairs]

# Let's confirm our sorting worked, print out eigenvalues
print('Eigenvalues in descending order: \n%s' % eigvalues_sorted)
[(9.792975698382946, array([-0.27225105, -0.28537005, -0.30148623, -0.27259451, -0.09857976,
       -0.19475579, -0.31051844,  0.30843834, -0.30754849, -0.27630107,
       -0.30274811, -0.30704063, -0.26152049,  0.04363236, -0.0367057 ,
       -0.05885041, -0.03483739, -0.08281362])), (3.377106439893973, array([-0.08972848,  0.13317394, -0.04402596, -0.20423223, -0.25913686,
       -0.09457563,  0.07233508, -0.01168768,  0.08409153,  0.12583663,
        0.07019986,  0.07793366,  0.20992728,  0.50391445, -0.01456825,
       -0.09339805, -0.50166421, -0.50654656])), (1.2087305396350991, array([ 0.02260451,  0.21080994, -0.07087808, -0.04021396,  0.11480523,
        0.13931348, -0.1129247 ,  0.09003305, -0.11106355,  0.21987769,
       -0.14481876, -0.11532395,  0.21362744, -0.06739209,  0.52162344,
       -0.68717064,  0.06220695,  0.04080354])), (1.1365956021766952, array([ 0.13041903, -0.02067855,  0.10742522, -0.25295734, -0.605228  ,
        0.32253141, -0.01005404,  0.07991176,  0.01604649,  0.06665079,
       -0.06980451, -0.01736316, -0.07224572, -0.13586056,  0.49012168,
        0.38023248, -0.03553916,  0.10300842])), (0.8962868592787947, array([-0.15232414,  0.13902259,  0.08073354, -0.11901255, -0.08321282,
        0.62137607, -0.08124056,  0.07473792, -0.0775021 ,  0.24614056,
       -0.14958407, -0.11511731,  0.00754872, -0.14052777, -0.5898001 ,
       -0.12779373, -0.18158269,  0.11125624])), (0.6582931281646526, array([ 0.25837458, -0.06889799, -0.02048009, -0.13944968, -0.58714549,
       -0.26562469,  0.08933352, -0.07258539,  0.09605543, -0.06350149,
        0.1344589 ,  0.12696867, -0.07339618, -0.13192887, -0.31241509,
       -0.4825069 ,  0.27522234,  0.06057715])), (0.32305652510792204, array([-0.18879422,  0.39087124, -0.17638455, -0.15647445, -0.10249295,
       -0.39885179, -0.09142373,  0.10487575, -0.09067234,  0.34966769,
       -0.07547531, -0.06996415,  0.45585196, -0.0790311 , -0.1301874 ,
        0.31062929,  0.25955786,  0.17634877])), (0.226906612823581, array([-0.77157824, -0.06605284,  0.29869388,  0.05204104, -0.1618725 ,
       -0.0585801 ,  0.08453009, -0.21681535,  0.033707  , -0.22668474,
        0.14577266,  0.05326118,  0.15819467, -0.30037443,  0.11468751,
       -0.11816895, -0.07270083,  0.01810343])), (0.11274168632338634, array([-0.36178478, -0.04629576, -0.2644992 , -0.17043033,  0.01172123,
        0.17321317,  0.1374993 , -0.25998873,  0.10326995,  0.24477641,
        0.05852399,  0.12890456, -0.33717059,  0.50136522,  0.07503938,
       -0.03072136,  0.36212245,  0.24071078])), (0.07620690593266875, array([-0.12523363,  0.24026261, -0.09429718,  0.08970625,  0.02875286,
       -0.24993762,  0.11124403,  0.12483705,  0.21146801,  0.38747386,
       -0.14703609,  0.16030531, -0.5876901 , -0.38703002,  0.05415026,
       -0.01360445, -0.22034329, -0.17141669])), (0.06183930986648124, array([ 0.02920095,  0.07295032,  0.77875503, -0.13164708,  0.04975346,
       -0.19844446, -0.16164291, -0.00429365, -0.24084172,  0.22458035,
        0.02069021, -0.19632299, -0.25843692,  0.22787544, -0.01398614,
       -0.01770107,  0.173696  , -0.07228256])), (0.044242096949759564, array([ 0.00076244,  0.19379992, -0.23264905,  0.2751439 , -0.14555863,
        0.1726002 , -0.08224395, -0.3500896 , -0.34252732,  0.03051544,
        0.23336895, -0.27516955, -0.10706355, -0.13895843,  0.00561401,
        0.08590214,  0.27965789, -0.53617119])), (0.029991914206113038, array([-0.10140749, -0.31133782,  0.05891668, -0.20457498,  0.15089389,
        0.17605501, -0.01518058,  0.46116491,  0.21887212,  0.15376507,
        0.17949901,  0.22036264,  0.14375371, -0.13465698, -0.01371668,
        0.02724337,  0.41458112, -0.46568396])), (0.026773513807314873, array([-0.14632686,  0.19646365,  0.0533932 ,  0.65891658, -0.28961083,
        0.0668512 , -0.07667788,  0.52322672,  0.02395043, -0.10441994,
        0.01166044,  0.07993056, -0.05219699,  0.30476919, -0.00476724,
       -0.0297178 ,  0.11479728,  0.08534806])), (0.019453744598141094, array([ 0.00332992,  0.58399614,  0.08641601, -0.27130049,  0.09640173,
        0.11084147,  0.0833249 , -0.01364472,  0.17281754, -0.54312295,
       -0.32493752,  0.1420518 , -0.08321772,  0.03012177, -0.02143018,
        0.01838425,  0.24102673, -0.17838785])), (0.017719193496813595, array([-0.00381639, -0.29623072,  0.09727353,  0.27490099, -0.11910007,
       -0.02929594,  0.05603555, -0.26509611,  0.2707093 ,  0.15367309,
       -0.72616302, -0.12281585,  0.16956796,  0.05394695, -0.03271513,
        0.01821737,  0.16696182, -0.19622361])), (0.010121609778869563, array([ 0.01059837, -0.08717666,  0.02287243,  0.02906688, -0.00940949,
        0.01209805,  0.27244221,  0.00261394, -0.68489239,  0.04473859,
       -0.25451099,  0.61310387,  0.04418914, -0.01597657, -0.00503223,
        0.01109924,  0.00776499, -0.04780496])), (0.0031261072615285062, array([-0.01066806, -0.00774671,  0.01119057, -0.03746892,  0.02098421,
       -0.01068883,  0.83714826,  0.24229591, -0.09869316, -0.01405494,
        0.01438663, -0.47567212,  0.00861257,  0.00755465, -0.00219811,
       -0.0139576 ,  0.03824018,  0.00398716]))]
Eigenvalues in descending order: 
[9.792975698382946, 3.377106439893973, 1.2087305396350991, 1.1365956021766952, 0.8962868592787947, 0.6582931281646526, 0.32305652510792204, 0.226906612823581, 0.11274168632338634, 0.07620690593266875, 0.06183930986648124, 0.044242096949759564, 0.029991914206113038, 0.026773513807314873, 0.019453744598141094, 0.017719193496813595, 0.010121609778869563, 0.0031261072615285062]
In [49]:
# Fraction of total variance captured by each principal component.
tot = sum(eigenvalues)
var_explained = [(i / tot) for i in sorted(eigenvalues, reverse=True)]  # variance ratio per component —
# one entry per eigenvalue (18 here, since the data has 18 features)
cum_var_exp = np.cumsum(var_explained)  # running total of the ratios; the last
# entry reaches 1.0 (i.e. 100% of the variance)
In [50]:
# Scree plot: per-component and cumulative explained-variance ratios.
fig, ax = plt.subplots()
ax.bar(range(1, 19), var_explained, alpha=0.5, align='center',
       label='individual explained variance')
ax.step(range(1, 19), cum_var_exp, where='mid',
        label='cumulative explained variance')
ax.set_ylabel('Explained variance ratio')
ax.set_xlabel('Principal components')
ax.legend(loc='best')
plt.show()
In [51]:
# Observation:
# - From the above plot we can clearly observe that 8 dimensions are able to explain ~95% of the variance in the data.
# - We will use 8 principal components going forward and calculate the reduced dimensions.
In [52]:
# Dimensionality Reduction
# Now 8 dimensions seems very reasonable. 
#With 8 variables we can explain over 95% of the variation in the original data!
In [53]:
# P_reduce is the reduced mathematical space: the top principal axes
# stacked as rows, one row per retained component.

P_reduce = np.array(eigvectors_sorted[:8])   # keep 8 of the 18 component axes

X_std_8D = np.dot(X_std, P_reduce.T)   # project the standardised data onto those axes

Proj_data_df = pd.DataFrame(X_std_8D)   # dataframe form for plotting and modelling
Proj_data_df
Out[53]:
0 1 2 3 4 5 6 7
0 -0.591125 -0.655523 0.564477 -0.659870 0.855251 -1.835814 0.155983 -0.683144
1 1.524878 -0.327117 0.251528 1.296236 0.282463 -0.091649 -0.209862 0.127745
2 -3.969982 0.239514 1.229875 0.180391 -0.919360 -0.650638 -0.826445 0.163185
3 1.549729 -3.037566 0.466449 0.394413 0.623392 0.383794 -0.131539 -0.176248
4 -5.468963 4.651385 -1.290061 0.023804 -1.692033 2.510965 -0.315330 0.475009
... ... ... ... ... ... ... ... ...
808 0.368201 -0.641878 -1.481101 0.164090 -0.777381 -0.934650 -0.874360 0.193428
809 0.040917 -0.160848 -0.473839 -0.179208 1.978454 -1.431609 0.279248 -0.302916
810 -5.188919 -0.171319 0.585738 -0.886837 1.348744 0.225891 -0.888525 -0.429704
811 3.321748 -1.094132 -1.930953 0.339361 0.527587 -0.030116 0.265542 0.451123
812 5.012853 0.432697 -1.315713 0.196398 0.167606 0.345863 0.409124 -0.221262

813 rows × 8 columns

In [54]:
# Visual check: pairwise scatter of the 8 principal components,
# with kernel-density estimates on the diagonal.
sns.pairplot(data=Proj_data_df, diag_kind='kde')
Out[54]:
<seaborn.axisgrid.PairGrid at 0x1a27225ad0>
In [55]:
# After dimensionality reduction using PCA our attributes have become independent with no correlation among themselves,
# as most of the scatter plots show clouds of data points with no linear kind of relationship.
In [56]:
# Let's Fit SVC Model ON Train-test Data:
In [57]:
# split the data
In [58]:
# train_test_split is already imported from sklearn.model_selection at the
# top of the notebook; reuse it for both splits so the two calls stay
# consistent (the original mixed `model_selection.train_test_split` with a
# hard-coded 0.30 in the second call).

test_size = 0.30  # 70:30 train/test split
seed = 7          # random seed for repeatability of the code

# PCA-reduced data
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(
    Proj_data_df, y, test_size=test_size, random_state=seed)

# Original (standardised) data — same fraction and seed so both models are
# evaluated on the same rows.
Orig_X_train, Orig_X_test, Orig_y_train, Orig_y_test = train_test_split(
    X_std, y, test_size=test_size, random_state=seed)
In [59]:
# Sanity-check the proportions of the split
train_pct = len(Orig_X_train) / len(Proj_data_df.index) * 100
test_pct = len(Orig_X_test) / len(Proj_data_df.index) * 100
print("{0:0.2f}% data is in training set".format(train_pct))
print("{0:0.2f}% data is in test set".format(test_pct))
69.99% data is in training set
30.01% data is in test set
In [60]:
# Inspect the two test sets (pandas/numpy truncate the displayed rows
# automatically, so this stays readable).
print(pca_X_test)
print(Orig_X_test)
            0         1         2         3         4         5         6  \
558 -4.481191  0.866539  1.948918  0.434209 -0.211405 -0.352762 -0.542643   
396  1.964272 -1.263445  0.910392 -0.224778 -0.579020 -0.279106  1.026415   
327  4.660313  2.486299 -0.479277 -1.537521 -0.778268 -0.102202 -0.444303   
278  3.249787 -1.768684 -1.542845  1.228381  0.427296  0.078997  0.288564   
264 -5.362055 -0.995548 -0.561607  0.985304 -0.380091 -1.711219  0.164456   
..        ...       ...       ...       ...       ...       ...       ...   
806  1.840122 -1.930005  1.706548 -0.075728  0.189226  0.009464  0.627631   
177  1.601219 -1.818944 -1.651601 -0.558317  0.145109 -0.117596 -0.689553   
640 -4.737213 -0.987764 -0.135143 -0.241860  0.244474  0.015232 -0.047512   
37   1.642834  0.939965  1.419198 -2.456310 -0.335444 -1.409952  0.089128   
97  -1.546332  0.311510 -2.598904 -1.458681 -1.526858  0.256477  0.312678   

            7  
558 -0.103803  
396  0.158257  
327 -0.361004  
278 -0.857219  
264 -0.620019  
..        ...  
806  0.064693  
177  0.389391  
640 -0.602374  
37   0.582745  
97  -0.982508  

[244 rows x 8 columns]
[[ 1.289602    1.50675698  1.21347634 ... -0.96631344 -0.3247698
   0.04054562]
 [-0.57264742 -0.13071585 -0.78268867 ... -0.18126136  0.65739206
   0.4491852 ]
 [-1.31754719 -1.27694683 -1.81296739 ... -0.74201285 -1.4706253
  -1.73022586]
 ...
 [ 2.03450176  1.3430097   1.4066536  ...  0.37949012  0.16631113
   0.99403796]
 [-1.56584711  0.03303143 -0.91147351 ... -1.41491462 -0.97954437
  -0.91294672]
 [ 1.04130207  0.03303143  0.05441278 ...  1.61314338  0.65739206
  -0.64052033]]
In [61]:
def getAccuracy(testSet, predictions):
    """Return the percentage of positions where predictions match the truth.

    Parameters
    ----------
    testSet : sequence
        True labels.
    predictions : sequence
        Predicted labels, element-aligned with ``testSet``.

    Returns
    -------
    float
        Accuracy in percent (0.0-100.0). Returns 0.0 for an empty
        ``testSet`` (the original raised ZeroDivisionError there).
    """
    if len(testSet) == 0:
        return 0.0
    # count element-wise matches instead of indexing by position
    correct = sum(1 for actual, predicted in zip(testSet, predictions)
                  if actual == predicted)
    return correct / len(testSet) * 100.0
In [62]:
# SVC is already imported at the top of the notebook; the extra
# `from sklearn import svm` here was unused and has been removed.
svc = SVC()  # instantiate the classifier with default hyperparameters
# fit the model on the original (standardised) 18-dimensional data
svc.fit(Orig_X_train, Orig_y_train)
# predict labels for the held-out test set
Orig_y_predict = svc.predict(Orig_X_test)
In [63]:
# Fit a second SVC on the 8-dimensional PCA projection;
# fit() returns the estimator, so construction and fitting chain together.
svc1 = SVC().fit(pca_X_train, pca_y_train)

# labels predicted from the reduced representation
pca_y_predict = svc1.predict(pca_X_test)
In [64]:
# Compare the two classifiers on their respective test sets.
orig_score = svc.score(Orig_X_test, Orig_y_test)
pca_score = svc1.score(pca_X_test, pca_y_test)
print("Model Score On Original Data ", orig_score)
print("Model Score On Reduced PCA Dimension ", pca_score)

# accuracy_score on the stored predictions gives the same numbers
print("Before PCA On Original 18 Dimension", accuracy_score(Orig_y_test, Orig_y_predict))
print("After PCA(On 8 dimension)", accuracy_score(pca_y_test, pca_y_predict))
Model Score On Original Data  0.9631147540983607
Model Score On Reduced PCA Dimension  0.9508196721311475
Before PCA On Original 18 Dimension 0.9631147540983607
After PCA(On 8 dimension) 0.9508196721311475
In [65]:
pca_y_grid = (np.column_stack([pca_y_test, pca_y_predict]))
In [66]:
print(pca_y_grid)
[[1. 1.]
 [0. 0.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [2. 2.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [2. 2.]
 [0. 0.]
 [1. 1.]
 [2. 1.]
 [1. 1.]
 [0. 0.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [2. 1.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [1. 0.]
 [0. 0.]
 [2. 2.]
 [0. 0.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [2. 2.]
 [0. 0.]
 [2. 2.]
 [0. 0.]
 [1. 1.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [0. 0.]
 [1. 1.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [1. 1.]
 [0. 0.]
 [2. 2.]
 [2. 2.]
 [0. 0.]
 [2. 2.]
 [1. 1.]
 [2. 2.]
 [0. 0.]
 [1. 1.]
 [0. 0.]
 [2. 2.]
 [2. 2.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [2. 2.]
 [2. 2.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [1. 2.]
 [0. 0.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [2. 2.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [2. 1.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [2. 2.]
 [2. 2.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [2. 1.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [2. 1.]
 [2. 2.]
 [0. 0.]
 [1. 1.]
 [0. 0.]
 [1. 2.]
 [0. 0.]
 [1. 1.]
 [2. 2.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [2. 2.]
 [1. 2.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [1. 2.]
 [0. 0.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [2. 2.]
 [1. 1.]
 [0. 1.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [2. 2.]
 [1. 1.]
 [2. 2.]
 [0. 0.]
 [2. 2.]
 [0. 0.]
 [0. 0.]
 [2. 2.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [0. 0.]
 [0. 0.]
 [2. 2.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [2. 2.]
 [0. 0.]
 [2. 1.]
 [1. 1.]
 [2. 2.]
 [1. 1.]
 [1. 1.]
 [0. 0.]
 [0. 0.]]
In [67]:
np.savetxt("ocr.csv", pca_y_grid , fmt='%s')
In [68]:
import string

# `string.ascii_uppercase` is already exactly the 26 letters A-Z, so the
# `[0:26]` slice in the original was redundant.
# NOTE(review): these letter labels look like a leftover from an OCR
# exercise — this notebook's classes are van/car/bus; confirm they are needed.
lab = list(string.ascii_uppercase)
plab = ["Pr " + s for s in lab]
In [69]:
# Filter the cases where the model committed mistakes and analyze them:
# which classes did most of the mistakes occur on?
In [70]:
# Calculate the confusion matrix and plot it as a heatmap.

def draw_confmatrix(y_test, yhat, str1, str2, str3, datatype):
    """Print and plot the 3-class confusion matrix for one model.

    Parameters
    ----------
    y_test : true labels (encoded 0/1/2)
    yhat : predicted labels
    str1, str2, str3 : display names for classes 0, 1 and 2
    datatype : caption identifying which model/data the matrix belongs to
    """
    # `labels` is keyword-only in current scikit-learn; passing it
    # positionally (as the original code did) raises a TypeError there.
    cm = confusion_matrix(y_test, yhat, labels=[0, 1, 2])
    print("Confusion Matrix For :", "\n", datatype, cm)
    # fmt='d' renders the integer counts without a spurious ".00" suffix
    sns.heatmap(cm, annot=True, fmt='d',
                xticklabels=[str1, str2, str3],
                yticklabels=[str1, str2, str3])
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()


draw_confmatrix(Orig_y_test, Orig_y_predict, "Van ", "Car ", "Bus", "Original Data Set")

draw_confmatrix(pca_y_test, pca_y_predict, "Van ", "Car ", "Bus", "For Reduced Dimensions Using PCA ")

# Classification report of the model built on the raw data
print("Classification Report For Raw Data:", "\n", classification_report(Orig_y_test, Orig_y_predict))

# Classification report of the model built on the principal components
print("Classification Report For PCA:", "\n", classification_report(pca_y_test, pca_y_predict))
Confusion Matrix For : 
 Original Data Set [[ 62   0   1]
 [  0 119   3]
 [  0   5  54]]
Confusion Matrix For : 
 For Reduced Dimensions Using PCA  [[ 62   1   0]
 [  1 117   4]
 [  0   6  53]]
Classification Report For Raw Data: 
               precision    recall  f1-score   support

         0.0       1.00      0.98      0.99        63
         1.0       0.96      0.98      0.97       122
         2.0       0.93      0.92      0.92        59

    accuracy                           0.96       244
   macro avg       0.96      0.96      0.96       244
weighted avg       0.96      0.96      0.96       244

Classification Report For PCA: 
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98        63
         1.0       0.94      0.96      0.95       122
         2.0       0.93      0.90      0.91        59

    accuracy                           0.95       244
   macro avg       0.95      0.95      0.95       244
weighted avg       0.95      0.95      0.95       244

In [71]:
##### Confusion matrix on the original data
#    - Out of 63 actual vans the model correctly classified 62, erring in one case where it predicted a van to be a bus.
#    - Out of 122 actual cars it correctly classified 119, wrongly labelling 3 cars as buses.
#    - Out of 59 actual buses it correctly classified 54, wrongly labelling 5 buses as cars.


# On the PCA-reduced data
# - Out of 63 actual vans the model correctly predicted 62, erring in 1 instance where it classified a van as a car.
# - Out of 122 actual cars it correctly classified 117, faltering in 5 cases (1 labelled van, 4 labelled bus).

# - Out of 59 actual buses it correctly classified 53, faltering in 6 cases where they were labelled cars.
In [72]:
# Build a linear regression model on the PCA dimensions

# Import the linear regression estimator
from sklearn.linear_model import LinearRegression

# fit on the PCA-reduced training data; fit() returns the estimator,
# so construction and fitting chain into one statement
regression_model = LinearRegression().fit(pca_X_train, pca_y_train)
regression_model.coef_
Out[72]:
array([ 0.04090819, -0.07605928,  0.12626454,  0.30841616,  0.29301096,
       -0.0495768 , -0.14523604, -0.23055271])
In [73]:
print("with pca data: intercept", regression_model.intercept_)
with pca data: intercept 0.9715417749414592
In [74]:
print("PCA score: ", regression_model.score(pca_X_test, pca_y_test))
PCA score:  0.5417950979991832
In [75]:
# Linear regression on the original (standardised) features. Use a separate
# estimator instead of refitting `regression_model` in place, so the PCA fit
# above is not silently overwritten (hidden-state bug on partial re-runs).
orig_regression_model = LinearRegression().fit(Orig_X_train, Orig_y_train)
print("with original data coef: ", orig_regression_model.coef_)
print("with original data: intercept", orig_regression_model.intercept_)
print("Original score: ", orig_regression_model.score(Orig_X_test, Orig_y_test))
with original data coef:  [ 0.12256079 -0.34374244  0.36722453  0.43057055 -0.2396492   0.2575449
 -0.44130646  1.13029559  0.09675039  0.44023939  0.27912788  0.09260166
 -0.05533061  0.04077368  0.04506528 -0.05961154 -0.16458783  0.18412199]
with original data: intercept 0.9783123113728347
Original score:  0.6786314628377199
In [76]:
# Lessons -

# 1. Use PCA only when the original dimensions have linear relations; here the original dimensions had negative curvilinear relations.
# 2. Remove outliers before doing PCA. We have significant outliers, which are due to a mix-up of the Gaussians in the original dimensions.

# Suggestions -

# 1. Segment the original data based on observations using K-Means clustering.
# 2. Remove the outliers from the segments.
# 3. If the original dimensions show strong linear relations within the segments, then apply PCA.
In [77]:
#Observation:

# Model Score On Original Data  0.9631147540983607
# Model Score On Reduced PCA Dimension  0.9508196721311475
# Before PCA On Original 18 Dimension 0.9631147540983607
# After PCA(On 8 dimension) 0.9508196721311475
# On the test set our support vector classifier without PCA has an accuracy score of about 96%,
# while the SVC model on the PCA-reduced dimensions scored about 95%.
# Considering that the original dataframe had 18 dimensions and after PCA this reduced to 8,
# our model has fared well in terms of accuracy: we lose only ~1% accuracy in far fewer
# dimensions, so PCA is a reasonable choice here if we can afford that small loss of information.


# I also tried both the PCA and the original data with linear regression. Observations there:
#     PCA score:  0.5417950979991832
#     Original score:  0.6786314628377199
#         The model performs very poorly with both the original and the PCA data, so we would not suggest linear regression here.
#         SVM (SVC) is the better approach.